In [3]:
import sqlite3
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
From the <ul class="list_follow"> element, collect the user IDs from the a tags whose href starts with "/@".
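Before the full Selenium crawler, here is a minimal sketch of just that extraction step, run on a hard-coded HTML fragment (the fragment is an assumption standing in for the real follower-list markup; the actual anchors carry the class link_follow):
In [ ]:
## minimal sketch: sample_html is assumed markup, not real Brunch output
from bs4 import BeautifulSoup

sample_html = """
<ul class="list_follow">
  <li><a class="link_follow" href="/@goodvc78">writer one</a></li>
  <li><a class="link_follow" href="/@someone">writer two</a></li>
</ul>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
ids = [a.get('href')[2:] for a in soup.find_all("a", class_="link_follow")
       if a.get('href', '').startswith("/@")]
print(ids)   # ['goodvc78', 'someone']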
In [4]:
# Brunch data crawling with Selenium
# source reference : http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
def crawlBrunchLink(uid, dir='follower', driver=None):
    ## html crawling
    if driver is None:
        driver = webdriver.Firefox()
    url = "https://brunch.co.kr/@{uid}/{dir}".format(uid=uid, dir=dir)
    driver.get(url)
    htmlsize = 0
    keep_cnt = 0
    for i in range(1, 200):
        ## scroll to the bottom to trigger the infinite-scroll loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.003)
        ## stop once the page source size stays unchanged for several scrolls
        if htmlsize == len(driver.page_source):
            keep_cnt += 1
        else:
            keep_cnt = 0
        htmlsize = len(driver.page_source)
        if keep_cnt > 5:
            break
    html_source = driver.page_source
    ## extract follower, following data
    data = html_source.encode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    classes = soup.find_all("a", class_="link_follow")
    idlist = []
    for c in classes:
        following = c.get('href')
        if following is None or len(following) == 0:
            continue
        idlist.append(following[2:])
    #driver.close()
    return idlist
In [5]:
## crawl info about writers of interest
## extract Brunch writer info: uid, name, profile description, image url, document/magazine/follower/following counts
def extractWriterInfo(uid):
    try:
        response = requests.get("http://brunch.co.kr/@{uid}".format(uid=uid))
    except Exception:
        return []
    data = response.content.decode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    ## name
    names = soup.find_all("strong", class_="profileUserName")
    name = uid if len(names) < 1 else names[0].getText()
    ## profile description (fallback string means "This is {uid}'s brunch.")
    desc = soup.find_all("pre", class_="profileUserDesc")
    desc = "{} 브런치입니다.".format(uid) if len(desc) < 1 else desc[0].getText()
    ## thumbnail image link
    imgsrc = soup.find_all("input", class_="profileUserImageUrl")
    imgsrc = "no-img" if len(imgsrc) < 1 else imgsrc[0].get('value')
    ## document/magazine/follower/following counts
    classes = soup.find_all("span", class_="num_count")
    reserved = [uid, name, desc, imgsrc]
    for c in classes:
        reserved.append(int(c.getText().replace(",", "")))
    ## pad with zeros so the record always has 8 fields
    if len(reserved) < 8:
        for n in range(0, 8 - len(reserved)):
            reserved.append(0)
    return reserved[:8]
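As a quick illustrative check (network access and the example id 'goodvc78' are assumed), the 8-field record returned above lines up with the writer_info_tbl columns used later:
In [ ]:
## illustrative only: map the returned fields onto the writer_info_tbl column names
info = extractWriterInfo('goodvc78')
cols = ['writerid', 'name', 'profile', 'imgsrc', 'documents', 'megazines', 'followers', 'followings']
print(dict(zip(cols, info)))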
In [6]:
def insertData(tbl_name, columns, rows):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    col_str = ", ".join(columns)
    val_str = ", ".join(['?' for n in columns])
    sql = "insert into {tbl} ({cols}) values ({vals}) ".format(tbl=tbl_name, cols=col_str, vals=val_str)
    try:
        conn.executemany(sql, rows)
        conn.commit()
    except Exception:
        conn.rollback()
    conn.close()

### sample code
#now = (int(time.time()))
#rows = [['goodvc78', 'test1', now ],['goodvc78', 'test2', now ]]
#insertData( 'follower_tbl', ['writerid', 'userid', 'tm'], rows)
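The insert helpers below assume that follower_tbl, following_tbl, and writer_info_tbl already exist. A schema along these lines (inferred from the column lists used in the helpers, not taken from the original setup) is enough to run them:
In [ ]:
## assumed schema, inferred from the column names used below
conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
conn.executescript("""
create table if not exists follower_tbl  (writerid text, userid text, tm integer);
create table if not exists following_tbl (userid text, writerid text, tm integer);
create table if not exists writer_info_tbl (
    writerid text, name text, profile text, imgsrc text,
    documents integer, megazines integer, followers integer, followings integer, tm integer);
""")
conn.commit()
conn.close()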
In [7]:
def insertFollowings(base, id_list):
    now = (int(time.time()))
    rows = [[base, uid, now] for uid in id_list]
    insertData('following_tbl', ['userid', 'writerid', 'tm'], rows)

def insertFollowers(base, id_list):
    now = (int(time.time()))
    rows = [[base, uid, now] for uid in id_list]
    insertData('follower_tbl', ['writerid', 'userid', 'tm'], rows)

def insertWriterInfo(writer_info_list):
    now = (int(time.time()))
    rows = []
    for info in writer_info_list:
        info.append(now)
        rows.append(info)
    colnames = ['writerid', 'name', 'profile', 'imgsrc', 'documents', 'megazines', 'followers', 'followings', 'tm']
    insertData('writer_info_tbl', colnames, rows)
In [6]:
## 1. goodvc78's follower crawling
base = 'goodvc78'
driver = webdriver.Firefox()
base_follower = crawlBrunchLink(base, dir='follower', driver=driver)
print ("내가 좋아하는 작가의 followers = %d" % len(base_follower) )
driver.close()
In [52]:
## 2. goodvc78 follower list insert
insertFollowers(base, base_follower)
In [8]:
def unreadUserid(limit=100):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    sql = """
    select userid from follower_tbl
    where userid not in ( select userid from following_tbl) limit {0};""".format(limit)
    ds = pd.read_sql(sql, conn)
    conn.close()
    return ds.userid.tolist()
In [9]:
def unreadWriterid(limit=10):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    sql = """
    select writerid, count(1) cnt from following_tbl
    where writerid not in ( select writerid from follower_tbl) and writerid != 'brunch'
    group by writerid
    having cnt > 20
    limit {0};""".format(limit)
    ds = pd.read_sql(sql, conn)
    conn.close()
    return ds.writerid.tolist()
In [10]:
def unreadWriterInfoid(limit=100):
    conn = sqlite3.connect('/Users/goodvc/Documents/data/sqllite/brunch_db.db')
    sql = """
    select writerid, count(1) cnt from following_tbl
    where writerid not in ( select writerid from writer_info_tbl) and writerid != 'brunch'
    group by writerid
    having cnt > 1
    limit {0};""".format(limit)
    ds = pd.read_sql(sql, conn)
    conn.close()
    return ds.writerid.tolist()
In [11]:
def crawlFollowing(limit=100):
    driver = webdriver.Firefox()
    users = unreadUserid(limit)
    print("\ncrawling users ", len(users))
    for uid in users:
        following = crawlBrunchLink(uid, dir='following', driver=driver)
        insertFollowings(uid, following)
        print('.', end="")
    driver.close()
In [12]:
def crawlFollower(limit=10):
    driver = webdriver.Firefox()
    writers = unreadWriterid(limit)
    print("\ncrawling writers ", len(writers))
    for writerid in writers:
        follower = crawlBrunchLink(writerid, dir='follower', driver=driver)
        insertFollowers(writerid, follower)
        print('.', end="")
    driver.close()
In [13]:
def crawlWriterInfo(limit=100):
    writers = unreadWriterInfoid(limit)
    print("\ncrawling writer info ", len(writers))
    infos = []
    for writerid in writers:
        info = extractWriterInfo(writerid)
        if len(info) != 8:
            print("skipped:{id} {val}".format(id=writerid, val=info))
            continue
        infos.append(info)
        print('.', end="")
    insertWriterInfo(infos)
In [14]:
## following list crawling
for n in range(1, 10):
    crawlFollowing(100)
In [20]:
crawlFollower(100)
In [21]:
## following list crawling
for n in range(1, 100):
    crawlFollowing(100)
In [ ]:
## writer info crawling
for n in range(1, 2):
    crawlWriterInfo(100)
In [ ]:
## following list crawling
for n in range(1, 100):
    crawlFollowing(100)
In [19]:
crawlFollowing(100)